Analysis of Global Compact COP reports

For this analysis we download from here all COP reports submitted up to 2017. We only take into account the reports submitted in English.

Please specify below the focus year of this analysis. It will be considered the end point of historical analyses, as well as the year for annual analyses.

In [3]:
# Focus year, stored as a string: end point of historical analyses and the year used for annual ones.
focus_year = "2017"

Please select the focus language using one of the following values:

  • en (English, default option)
  • de (German)
  • es (Spanish)
  • fr (French)
  • pt (Portuguese)
In [80]:
# Two-letter code of the focus language; expected to be one of the keys of language_ref.
focus_language = 'en'
In [5]:
# Per-language settings keyed by two-letter language code.
# Each row is (code, display name, min_coocurrence threshold).
# NOTE(review): 'min_coocurrence' presumably is a minimum co-occurrence count used
# by a later analysis step -- confirm against the cell that reads it.
language_ref = {
    code: {'name': name, 'min_coocurrence': threshold}
    for code, name, threshold in (
        ('en', 'English', 10),
        ('de', 'German', 2),
        ('es', 'Spanish', 2),
        ('fr', 'French', 2),
        ('pt', 'Portuguese', 2),
    )
}

1. Gathering information about COP reports available from the UN Global Compact website

The UN Global Compact website contains an entry for each COP report, describing the sector of the company submitting the report, its country and the year, as well as the language in which the report was written and a link to a PDF file with the full report.

The results in this section give a general view of the available COPs; they are not yet restricted by focus_year and focus_language.

In [6]:
import requests
import re
from bs4 import BeautifulSoup

# First page of the active-COP listing, limited to 10 rows: only its header is read here.
gc_url = "https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active?page=1&per_page=10"
# Site root; relative links found in the listing are resolved against it.
gc_base_url = "https://www.unglobalcompact.org"

gc_home = requests.get(gc_url)
# Fail here on an HTTP error instead of parsing an error page further down.
gc_home.raise_for_status()

soup = BeautifulSoup(gc_home.content, 'lxml')

# The total is read from the first <h2>: the digits that follow ": ".
header = soup.h2.string if soup.h2 is not None else None

count_match = re.search(r'(?<=: )[0-9]+', header or '')
if count_match is None:
    raise ValueError("Could not find the number of COPs in the page header: %r" % (header,))

# Kept as a string: it is concatenated into the full-listing URL in a later cell.
total_num_cops = count_match[0]
print("Total number of COPs available: %s" % total_num_cops)
Total number of COPs available: 34938
In [7]:
# Same listing as before, but with per_page set to the total so every COP is on one page.
full_gc_url = "https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active?page=1&per_page=" + total_num_cops

print("Getting full list of reports ...")
gc_full_list = requests.get(full_gc_url)
# Fail here on an HTTP error; otherwise a bad response only surfaces later as a
# missing <tbody> when the table rows are read.
gc_full_list.raise_for_status()

gc_full_list_soup = BeautifulSoup(gc_full_list.content, 'lxml')
Getting full list of reports ...
In [9]:
def check_sdgs_3_13(profile):
    """Report whether SDG 3 and SDG 13 are marked as selected on a COP profile page.

    Args:
        profile: parsed profile page (BeautifulSoup-like object exposing
            ``find_all``).

    Returns:
        Tuple ``(has_sdg3, has_sdg13)`` where each element is the string
        ``"yes"`` or ``"no"``. Both are ``"no"`` when the page does not hold
        exactly two ``ul.questionnaire`` lists, or when neither list has the
        expected 18 items.
    """
    expected_items = 18  # the correct SDG questionnaire has 17 questions + header

    questionnaires = profile.find_all("ul", class_='questionnaire')
    if len(questionnaires) != 2:
        return ("no", "no")

    # The SDG list may be either of the two questionnaires; take the first
    # one (in page order) that has the expected length.
    sdgs = []
    for questionnaire in questionnaires:
        items = questionnaire.find_all("li")
        if len(items) == expected_items:
            sdgs = items
            break
    if not sdgs:
        # No SDG questionnaire recognised: nothing can be selected.
        return ("no", "no")

    def is_selected(item):
        # An <li> without a class attribute yields None from .get('class').
        return 'selected_question' in (item.get('class') or [])

    # Positions 3 and 13 are used for SDG 3 and SDG 13
    # (presumably the header occupies position 0 -- TODO confirm).
    has_sdg3 = "yes" if is_selected(sdgs[3]) else "no"
    has_sdg13 = "yes" if is_selected(sdgs[13]) else "no"
    return (has_sdg3, has_sdg13)

# One <tr> per COP entry in the full listing table.
participants = gc_full_list_soup.tbody.find_all("tr")
# Maps each PDF attachment link to the metadata of the COP entry it belongs to.
pdfs = {}

num_pdfs = 0       # attachment links containing ".pdf"
num_nonpdfs = 0    # attachment links in any other format
num_noreport = 0   # entries with no attachment link at all

# Captures the text inside the trailing "(...)" of a link label
# (presumably the language name, e.g. "(English)" -- confirm against the site).
langregex = re.compile(r'(?<=\()[^\)\(]+(?=\)$)')

print("Getting details of each report ...")
# A single Session is shared by all profile requests so connections to the
# site are reused instead of being re-opened for every entry.
with requests.Session() as session:
    for participant in participants:
        # Listing columns as read here: 0 = link to the entry, 1 = sector, 2 = country, 3 = year.
        cells = participant.find_all('td')
        sector = cells[1].get_text(strip=True)
        country = cells[2].get_text(strip=True)
        year = cells[3].get_text(strip=True)

        participant_entry_url = gc_base_url + cells[0].a.get('href')
        participant_profile = session.get(participant_entry_url)
        participant_profile_soup = BeautifulSoup(participant_profile.content, 'lxml')

        (participant_sdgs_3, participant_sdgs_13) = check_sdgs_3_13(participant_profile_soup)

        main_body = participant_profile_soup.find("section", class_='main-content-body')
        list_items = main_body.find_all("li")
        found_report = False
        for li in list_items:
            if li.a:
                link = li.a.get('href')
                if "/system/attachments/" in link:
                    if ".pdf" in link:
                        num_pdfs += 1
                        # A label without a trailing "(...)" must not abort the whole
                        # crawl: record the language as "unknown" instead.
                        language_match = langregex.search(li.get_text(strip=True))
                        language = language_match[0] if language_match else "unknown"
                        pdfs[link] = { "sector" : sector, "country" : country, "year" : year, "language" : language, "sdgs3" : participant_sdgs_3, "sdgs13" : participant_sdgs_13}
                        print(".", end='')
                    else:
                        num_nonpdfs += 1
                    found_report = True
        if not found_report:
            num_noreport += 1
print(" done.")
print("PDFs: %d, non-PDFs: %d, no-report: %d" % (num_pdfs, num_nonpdfs, num_noreport))
Getting details of each report ...
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... done.
PDFs: 34967, non-PDFs: 1253, no-report: 4413

Saving index of reports so that it can be reused

In [12]:
import pandas as pd

reports_index_csv_filename = "../data/cops/reports_index.csv"

# One row per entry of `pdfs` (the dict keys become the index column), one
# column per attribute stored for that entry; written as a tab-separated file.
df_pdfs = pd.DataFrame.from_dict(pdfs, orient='index')
df_pdfs.to_csv(
    reports_index_csv_filename,
    sep='\t',
    encoding='utf-8',
)

Possible starting point: This can be used when an index file is available (has been saved previously). Only run this cell if starting from this point, otherwise skip it.

In [ ]:
import pandas as pd

reports_index_csv_filename = "../data/cops/reports_index.csv"

# The first column holds the original dict keys; 'year' is read as a plain
# object column so that years stay strings and can be compared with focus_year.
df_pdfs = pd.read_csv(
    reports_index_csv_filename,
    sep='\t',
    encoding='utf-8',
    index_col=0,
    dtype={'year': object},
)
pdfs = df_pdfs.to_dict(orient='index')

In [13]:
# Tallies over every indexed report (no year/language filter applied here).
countries, sectors, years, languages = {}, {}, {}, {}
sdgs3 = sdgs13 = sdgs3_13 = 0

for pdf, report in pdfs.items():
    language = report["language"]
    year = report["year"]
    country = report["country"]
    sector = report["sector"]
    sdg3 = report["sdgs3"]
    sdg13 = report["sdgs13"]

    # Each dict maps a category value to the number of reports carrying it.
    sectors[sector] = sectors.get(sector, 0) + 1
    countries[country] = countries.get(country, 0) + 1
    years[year] = years.get(year, 0) + 1
    languages[language] = languages.get(language, 0) + 1

    # SDG flags are stored as the string "yes" when the report covers the goal.
    has_sdg3 = sdg3 == "yes"
    has_sdg13 = sdg13 == "yes"
    if has_sdg3:
        sdgs3 += 1
    if has_sdg13:
        sdgs13 += 1
        if has_sdg3:
            sdgs3_13 += 1
In [14]:
# Counts of reports flagged with SDG 3, with SDG 13, and with both.
sdg_counts = (sdgs3, sdgs13, sdgs3_13)
print("Number of reports that include SDG 3 or SDG 13 or both: %d, %d, %d respectively" % sdg_counts)
Number of reports that include SDG 3 or SDG 13 or both: 5410, 5036, 3578 respectively
In [15]:
# Languages ranked by number of reports, most frequent first.
language_counts = sorted(languages.items(), key=lambda item: item[1], reverse=True)
df_languages = pd.DataFrame(language_counts, columns=["Language", "Number of reports"])
df_languages
Out[15]:
Language Number of reports
0 English 16916
1 Spanish 8849
2 French 3388
3 Portuguese 1398
4 German 1003
5 Japanese 825
6 Korean 431
7 Chinese-Mandarin 354
8 Turkish 276
9 Italian 228
10 Danish 220
11 Russian 203
12 Swedish 142
13 Other 129
14 Polish 83
15 Lithuanian 78
16 Ukrainian 75
17 Arabic 65
18 Dutch 54
19 Serbian 47
20 Bulgarian 44
21 Greek 35
22 Icelandic 31
23 Chinese-Cantonese 29
24 Norwegian 29
25 Catalan 18
26 Thai 7
27 Indonesian 5
28 Vietnamese 3
29 Tagalog 1
30 Swahili-Kiswahili 1
In [16]:
# Countries ranked by number of reports, most frequent first.
country_counts = sorted(countries.items(), key=lambda item: item[1], reverse=True)
df_countries = pd.DataFrame(country_counts, columns=["Country", "Number of reports"])
df_countries
Out[16]:
Country Number of reports
0 Spain 4531
1 France 4158
2 Brazil 1664
3 Germany 1583
4 Denmark 1550
5 Japan 1489
6 Mexico 1384
7 Colombia 1316
8 United State... 1208
9 Sweden 1001
10 United Kingdom 994
11 Korea, Repub... 828
12 Argentina 760
13 China 686
14 India 590
15 Turkey 582
16 Netherlands 492
17 Peru 486
18 Switzerland 478
19 Italy 471
20 Australia 425
21 Norway 414
22 Finland 319
23 Singapore 276
24 Panama 275
25 South Africa 275
26 Kenya 267
27 Chile 252
28 Austria 252
29 Greece 243
... ... ...
108 Bermuda 9
109 Benin 9
110 Venezuela 8
111 Senegal 7
112 Malawi 7
113 Zimbabwe 6
114 Haiti 6
115 Albania 6
116 Oman 5
117 Gabon 5
118 Cameroon 5
119 Gambia 5
120 Angola 4
121 Togo 4
122 Cote d'Ivoire 4
123 Sao Tome And... 4
124 Cape Verde 4
125 Montenegro 3
126 Congo, Democ... 2
127 Cambodia 2
128 Seychelles 2
129 Somalia 2
130 Afghanistan 2
131 Honduras 2
132 Papua New Gu... 1
133 South Sudan 1
134 Ethiopia 1
135 Burundi 1
136 Namibia 1
137 Burkina Faso 1

138 rows × 2 columns

In [17]:
# Sectors ranked by number of reports, most frequent first.
sector_counts = sorted(sectors.items(), key=lambda item: item[1], reverse=True)
df_sectors = pd.DataFrame(sector_counts, columns=["Sector", "Number of reports"])
df_sectors
Out[17]:
Sector Number of reports
0 Support Services 4921
1 General Industrials 2629
2 Construction & Materials 2349
3 Financial Services 2234
4 Food Producers 1618
5 Software & Computer Services 1616
6 General Retailers 1213
7 Electronic & Electrical Equ... 1118
8 Media 1080
9 Travel & Leisure 923
10 Industrial Engineering 896
11 Industrial Transportation 860
12 Chemicals 855
13 Electricity 847
14 Technology Hardware & Equip... 817
15 Health Care Equipment & Ser... 787
16 Automobiles & Parts 747
17 Industrial Metals & Mining 744
18 Banks 734
19 Personal Goods 732
20 Oil & Gas Producers 682
21 Beverages 652
22 Gas, Water & Multiutilities 617
23 Pharmaceuticals & Biotechno... 595
24 Not Applicable 555
25 Real Estate Investment & Se... 554
26 Forestry & Paper 443
27 Mobile Telecommunications 413
28 Household Goods & Home Cons... 398
29 Fixed Line Telecommunications 382
30 Oil Equipment, Services & D... 310
31 Aerospace & Defense 253
32 Nonlife Insurance 234
33 Mining 234
34 Alternative Energy 217
35 Life Insurance 164
36 Food & Drug Retailers 131
37 Leisure Goods 121
38 Diversified 110
39 Equity Investment Instruments 94
40 Real Estate Investment Trusts 52
41 Tobacco 15
42 Nonequity Investment Instru... 13
43 Telecommunications 7
44 Industrial Goods & Services 1
In [18]:
# Report counts per year, listed from the most recent year backwards.
year_counts = sorted(years.items(), reverse=True)
df_years = pd.DataFrame(year_counts, columns=["Year", "Number of reports"])
df_years
Out[18]:
Year Number of reports
0 2018 3382
1 2017 5591
2 2016 5296
3 2015 5177
4 2014 4577
5 2013 4558
6 2012 3811
7 2011 2564
8 2010 1
9 2009 4
10 2008 2
11 2007 1
12 2006 1
13 2005 1
14 2004 1

2. Selecting COP reports that match required criteria (up to focus_year, written in focus_language)

In [81]:
# Tallies restricted to reports in the focus language submitted up to focus_year.
selected_sectors = {}
selected_countries = {}
selected_years = {}
selected_countries_years = {}  # country -> {year -> number of reports}

selected_pdfs = {}

for pdf, report in pdfs.items():
    language = report["language"]
    year = report["year"]
    country = report["country"]
    sector = report["sector"]

    # Skip anything in another language or submitted after the focus year.
    if language != language_ref[focus_language]['name'] or int(year) > int(focus_year):
        continue

    selected_pdfs[pdf] = report

    selected_sectors[sector] = selected_sectors.get(sector, 0) + 1
    selected_countries[country] = selected_countries.get(country, 0) + 1
    selected_years[year] = selected_years.get(year, 0) + 1

    years_for_country = selected_countries_years.setdefault(country, {})
    years_for_country[year] = years_for_country.get(year, 0) + 1
In [82]:
# Summary of how many reports passed the year/language selection.
selected_language_name = language_ref[focus_language]['name']
print("There are %d reports up to %s written in %s" % (len(selected_pdfs), focus_year, selected_language_name))
There are 15216 reports up to 2017 written in English
In [83]:
# Countries ranked by number of selected reports, most frequent first.
selected_country_counts = sorted(selected_countries.items(), key=lambda item: item[1], reverse=True)
df_selected_countries = pd.DataFrame(selected_country_counts, columns=["Country", "Number of reports"])
df_selected_countries
Out[83]:
Country Number of reports
0 Denmark 1172
1 United State... 1054
2 France 935
3 United Kingdom 888
4 Sweden 726
5 Germany 676
6 Japan 609
7 India 556
8 Netherlands 387
9 Australia 382
10 Korea, Repub... 363
11 Switzerland 352
12 Spain 348
13 Brazil 342
14 Norway 335
15 China 306
16 Turkey 287
17 South Africa 261
18 Finland 247
19 Kenya 244
20 Singapore 236
21 Myanmar 214
22 Italy 214
23 Mexico 213
24 Greece 194
25 Canada 178
26 Egypt 154
27 United Arab ... 144
28 Sri Lanka 134
29 Pakistan 128
... ... ...
100 Malawi 7
101 Tanzania, Un... 6
102 Albania 6
103 Uruguay 6
104 Tunisia 6
105 Andorra 5
106 Gambia 5
107 El Salvador 4
108 Cameroon 4
109 Togo 4
110 Oman 4
111 Guatemala 4
112 Zimbabwe 3
113 Angola 2
114 Cambodia 2
115 Somalia 2
116 Afghanistan 2
117 Madagascar 2
118 Congo, Democ... 1
119 Monaco 1
120 South Sudan 1
121 Ethiopia 1
122 Morocco 1
123 Montenegro 1
124 Cote d'Ivoire 1
125 Seychelles 1
126 Sao Tome And... 1
127 Senegal 1
128 Namibia 1
129 Honduras 1

130 rows × 2 columns

In [84]:
# Sectors ranked by number of selected reports, most frequent first.
selected_sector_counts = sorted(selected_sectors.items(), key=lambda item: item[1], reverse=True)
df_selected_sectors = pd.DataFrame(selected_sector_counts, columns=["Sector", "Number of reports"])
df_selected_sectors
Out[84]:
Sector Number of reports
0 Support Services 1678
1 General Industrials 1184
2 Financial Services 1105
3 Construction & Materials 926
4 Software & Computer Services 587
5 Food Producers 584
6 Electronic & Electrical Equ... 579
7 Technology Hardware & Equip... 498
8 General Retailers 484
9 Chemicals 473
10 Travel & Leisure 467
11 Industrial Transportation 465
12 Media 450
13 Industrial Engineering 432
14 Industrial Metals & Mining 406
15 Personal Goods 404
16 Automobiles & Parts 382
17 Oil & Gas Producers 382
18 Beverages 320
19 Banks 320
20 Pharmaceuticals & Biotechno... 305
21 Real Estate Investment & Se... 266
22 Mobile Telecommunications 257
23 Health Care Equipment & Ser... 238
24 Electricity 233
25 Forestry & Paper 209
26 Household Goods & Home Cons... 199
27 Gas, Water & Multiutilities 161
28 Fixed Line Telecommunications 159
29 Aerospace & Defense 152
30 Not Applicable 149
31 Oil Equipment, Services & D... 146
32 Mining 128
33 Alternative Energy 97
34 Nonlife Insurance 78
35 Equity Investment Instruments 78
36 Leisure Goods 56
37 Life Insurance 52
38 Diversified 43
39 Real Estate Investment Trusts 38
40 Food & Drug Retailers 29
41 Nonequity Investment Instru... 10
42 Tobacco 4
43 Telecommunications 2
44 Industrial Goods & Services 1
In [85]:
# Selected report counts per year, listed from the most recent year backwards.
selected_year_counts = sorted(selected_years.items(), reverse=True)
df_selected_years = pd.DataFrame(selected_year_counts, columns=["Year", "Number of reports"])
df_selected_years
Out[85]:
Year Number of reports
0 2017 2657
1 2016 2652
2 2015 2451
3 2014 2259
4 2013 2140
5 2012 1774
6 2011 1276
7 2010 1
8 2009 1
9 2008 1
10 2007 1
11 2006 1
12 2005 1
13 2004 1

3. Downloading PDF file for each COP report that matches required criteria

At this time we've only considered reports written in the focus language and submitted up to end of the focus year.

Specify the folder where the PDFs will be downloaded (the 'pdfs_folder' variable below).

If this process has been run before and some files are already available in the specified folder, they won't be downloaded again.

In [86]:
# Folder the report PDFs are downloaded to. Keep the trailing slash: file names
# are appended to this value by plain string concatenation.
pdfs_folder = "../data/cops/pdfs/"
In [87]:
# Matches the last path segment of a report link (everything after the final
# "/"), which is used as the local file name. The previous character class
# also excluded a literal "$", so links whose file name contained "$" did not
# match at all and the `[0]` subscript on the search result raised.
filenameregex = re.compile(r'(?<=/)[^/]+$')
In [ ]:
import PyPDF2
import shutil
import nltk
import os
#import os.path

# Create the download folder (and any missing parent folders) if needed.
os.makedirs(pdfs_folder, exist_ok=True)

for pdf in selected_pdfs.keys():
    filename = pdfs_folder + filenameregex.search(pdf)[0]

    if os.path.isfile(filename):
        print("Skipping %s, PDF already available in folder" % (filename))
        continue

    print("Saving %s" % (filename))
    # Download under a temporary name and rename only on success, so that an
    # interrupted or failed transfer never leaves a truncated file that a later
    # run would mistake for a finished download.
    partial_filename = filename + ".part"
    try:
        response = requests.get(gc_base_url + pdf, stream=True)
        try:
            # Do not store HTTP error pages (404, 500, ...) as if they were PDFs.
            response.raise_for_status()
            with open(partial_filename, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
        finally:
            response.close()
        os.replace(partial_filename, filename)
    except Exception as error:
        # Best effort: report the failure and carry on with the next report.
        # `Exception` (rather than a bare except) keeps Ctrl-C working.
        print("Could not save %s" % (filename))
        print("  reason: %s" % (error,))
        if os.path.isfile(partial_filename):
            os.remove(partial_filename)

4. Extracting text from the PDF file of each report

Specify the folder where the text files will be saved (the 'txts_folder' variable below).

This process may fail to extract the text from some PDF files.

If this process has been run before and some text files are already available in the specified folder, they won't be processed again.

In [138]:
# Folder the extracted text files are written to. Keep the trailing slash: file
# names are appended to this value by plain string concatenation.
txts_folder = "../data/cops/txts/"
In [ ]:
# Create the output folder (and any missing parent folders) if needed.
os.makedirs(txts_folder, exist_ok=True)

for pdf in selected_pdfs.keys():
    filename = pdfs_folder + filenameregex.search(pdf)[0]
    filenametxt = txts_folder + filenameregex.search(pdf)[0] + ".txt"

    if os.path.isfile(filenametxt):
        print("Skipping %s, TXT already available in folder" % (filename))
        continue

    print("Loading %s" % (filename))
    try:
        pdfFileObj = open(filename, 'rb')
    except OSError:
        print("Couldn't load %s" % (filename))
        continue

    # The text goes to a temporary name first and is renamed at the very end,
    # so an unreadable PDF or an interrupted run never leaves an empty or
    # partial .txt behind that later runs would skip as "already available".
    partial_filenametxt = filenametxt + ".part"
    with pdfFileObj:
        try:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num_pages = pdfReader.numPages
            # Only create the output file once the PDF has been parsed.
            txtFileObj = open(partial_filenametxt, 'w')
        except Exception:
            print("Couldn't load %s" % (filename))
            continue

        print("Extracting text from %s" % (filename))
        with txtFileObj:
            for num_page in range(0, num_pages):
                try:
                    pageObj = pdfReader.getPage(num_page)
                    txtFileObj.write(pageObj.extractText())
                except Exception:
                    # Best effort: keep the pages that could be extracted.
                    print("Couldn't extract txt %s, page %d" % (filename, num_page))

    os.replace(partial_filenametxt, filenametxt)

5. Analysing the text of reports: searching for health and climate mentions

Keywords

In [91]:
import json
import os
import re

# Keyword lists live in one JSON file per language code: <focus_language>.json
# inside the ../keywords folder.
keywords_file = os.path.join("..", "keywords", focus_language + ".json")

climate_dict = []
health_dict = []
compound_terms = []

def normalise_keywords(dictionary): #lowercases and handles compounds
    """Normalise a keyword list in place and return that same list.

    Every keyword is lowercased and its spaces are replaced by underscores.
    For each keyword that contained a space, the tuple of its
    underscore-separated parts is appended to the module-level
    `compound_terms` list.
    """
    for position, raw_keyword in enumerate(dictionary):
        lowered = raw_keyword.lower()
        joined = lowered.replace(' ', '_')
        if joined != lowered:
            # Multi-word keyword: remember its parts for the compound tokenizer.
            compound_terms.append(tuple(joined.split('_')))
        dictionary[position] = joined
    return dictionary

def generate_hashtags(dictionary):
    """Return a new list holding the hashtag form of every keyword.

    A hashtag is "#" followed by the keyword with all underscores removed.
    """
    return ["#" + keyword.replace('_', '') for keyword in dictionary]

# Read the keyword file explicitly as UTF-8 instead of the platform default
# encoding, so that accented keywords in the non-English lists load correctly
# on every system.
with open(keywords_file, encoding='utf-8') as f:
    data = json.load(f)

# The JSON object provides one keyword list under 'climate' and one under 'health'.
climate_dict = normalise_keywords(data['climate'])
health_dict = normalise_keywords(data['health'])

climate_hashtag_dict = generate_hashtags(climate_dict)
health_hashtag_dict = generate_hashtags(health_dict)
In [92]:
health_dict
Out[92]:
['malaria',
 'diarrhoea',
 'infection',
 'disease',
 'sars',
 'measles',
 'pneumonia',
 'epidemic',
 'pandemic',
 'public_health',
 'healthcare',
 'epidemiology',
 'health_care',
 'health',
 'mortality',
 'morbidity',
 'nutrition',
 'illness',
 'infectious',
 'ncd',
 'non-communicable_disease',
 'noncommunicable_disease',
 'communicable_disease',
 'air_pollution',
 'nutrition',
 'malnutrition',
 'mental_disorder',
 'stunting']
In [93]:
climate_dict
Out[93]:
['climate_change',
 'global_warming',
 'green_house',
 'temperature',
 'extreme_weather',
 'global_environmental_change',
 'climate_variability',
 'greenhouse',
 'low_carbon',
 'ghge',
 'renewable_energy',
 'carbon_emission',
 'co2_emission',
 'climate_pollutant']
In [94]:
# Lookup table: region label -> list of country names belonging to it.
# Names must match the country names found in the reports index. "Cape Verde"
# is listed next to "Cabo Verde" because the index uses the former spelling,
# and two entries whose accented characters had been garbled ("Åland Islands",
# "Saint Barthélemy") are restored so that they can actually match.
who_regions = {}

who_regions["Africa"] = ["Algeria", "Angola", "Benin", "Botswana", "British Indian Ocean Territory", 
                         "Burkina Faso", "Burundi", "Cabo Verde", "Cape Verde", "Cameroon", 
                         "Central African Republic", 
                         "Chad", "Comoros", "Congo, Democratic Republic of the", "Cote d'Ivoire", 
                         "Democratic Republic of the Congo", "Congo, Republic of the", 
                         "Equatorial Guinea", "Eritrea", "Ethiopia", "French Southern Territories", "Gabon", 
                         "Gambia", "Ghana", "Guinea", "Guinea-Bissau", "Kenya", "Lesotho", "Liberia", 
                         "Madagascar", "Malawi", "Mali", "Mauritania", "Mauritius", "Mayotte", "Mozambique", 
                         "Namibia", "Niger", "Nigeria", "Rwanda", "Réunion", "Saint Helena", 
                         "Sao Tome And Principe", "Senegal", "Seychelles", "Sierra Leone", "South Africa", 
                         "South Sudan", "Swaziland", "Togo", "Uganda", "Tanzania, United Republic of", 
                         "Western Sahara", "Zambia", "Zimbabwe"]

who_regions["Eastern Mediterranean"] = ["Afghanistan", "Bahrain", "Djibouti", "Egypt", 
                                        "Iran, Islamic Republic of", "Iraq", "Jordan", "Kuwait", "Lebanon", 
                                        "Libya", "Morocco", "Oman", "Pakistan", "Qatar", "Saudi Arabia", 
                                        "Somalia", "Palestine, State of", "Sudan", "Syrian Arab Republic", 
                                        "Tunisia", "United Arab Emirates", "Yemen"]

who_regions["Europe"] = ["Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus", "Belgium", 
                         "Bosnia-Herzegovina", "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark", 
                         "Estonia", "Faroe Islands", "Finland", "France", "Georgia", "Germany", "Gibraltar", 
                         "Greece", "Guernsey", "Holy See", "Hungary", "Iceland", "Ireland", "Isle of Man", 
                         "Israel", "Italy", "Jersey", "Kazakhstan", "Kyrgyzstan", "Latvia", "Liechtenstein", 
                         "Lithuania", "Luxembourg", "Malta", "Monaco", "Montenegro", "Netherlands", "Norway", 
                         "Poland", "Portugal", "Moldova, Republic of", "Romania", "Russian Federation", 
                         "San Marino", "Sark", "Serbia", "Slovakia", "Slovenia", "Spain", 
                         "Svalbard and Jan Mayen Islands", "Sweden", "Switzerland", "Tajikistan", 
                         "Macedonia, The former Yugoslav Republic of", "Turkey", "Turkmenistan", "Ukraine", 
                         "United Kingdom", "Uzbekistan", 
                         "Åland Islands"]

who_regions["Latin America and the Caribbean"] = ["Anguilla", "Antigua and Barbuda", "Argentina", "Aruba", 
                                                  "Bahamas", "Barbados", "Belize", "Bolivia", "Bonaire", 
                                                  "Bouvet Island", "Brazil", "British Virgin Islands", 
                                                  "Cayman Islands", "Chile", "Colombia", "Costa Rica", "Cuba", 
                                                  "Curaçao", "Dominica", "Dominican Republic", "Ecuador", 
                                                  "El Salvador", "Falkland Islands (Malvinas)", "French Guiana", 
                                                  "Grenada", "Guadeloupe", "Guatemala", "Guyana", "Haiti", 
                                                  "Honduras", "Jamaica", "Martinique", "Mexico", "Montserrat", 
                                                  "Nicaragua", "Panama", "Paraguay", "Peru", "Puerto Rico", 
                                                  "Saint Barthélemy", "Saint Kitts and Nevis", "Saint Lucia", 
                                                  "Saint Martin", "Saint Vincent and the Grenadines", 
                                                  "Sint Maarten", "South Georgia and the South Sandwich Islands", 
                                                  "Suriname", "Trinidad And Tobago", "Turks and Caicos Islands", 
                                                  "Virgin Islands", "Uruguay", "Venezuela"]

who_regions["North America"] = ["Bermuda", "Canada", "Greenland", "Saint Pierre and Miquelon", 
                                "United States of America"]

who_regions["South-East Asia"] = ["Bangladesh", "Bhutan", "Democratic People's Republic of Korea", "India", 
                                  "Indonesia", "Maldives", "Myanmar", "Nepal", "Sri Lanka", "Thailand", 
                                  "Timor-Leste"]

who_regions["Western Pacific"] = ["American Samoa", "Australia", "Brunei", "Cambodia", "China", 
                                  "Hong Kong", "Macao", "Taiwan", "Christmas Island", "Cocos (Keeling) Islands", 
                                  "Cook Islands", "Fiji", "French Polynesia", "Guam", 
                                  "Heard Island and McDonald Islands", "Japan", "Kiribati", 
                                  "Laos", "Malaysia", "Marshall Islands", "Micronesia", "Mongolia", "Nauru", 
                                  "New Caledonia", "New Zealand", "Niue", "Norfolk Island", 
                                  "Northern Mariana Islands", "Palau", "Papua New Guinea", "Philippines", 
                                  "Pitcairn", "Korea, Republic of", "Samoa", "Singapore", "Solomon Islands", 
                                  "Tokelau", "Tonga", "Tuvalu", "Minor Outlying Islands", "Vanuatu", 
                                  "Viet Nam", "Wallis and Futuna Islands"]

def get_who_region(country, regions=None):
    """Return the region label that `country` belongs to, or False if unknown.

    Parameters:
        country: country name as found in the reports index. Names may arrive
            truncated with a trailing "..." (e.g. "United State..."); such
            names are matched as a prefix of the known country names.
        regions: optional mapping of region label -> list of country names.
            Defaults to the module-level `who_regions` table.

    Returns:
        The label of the first region containing the country, or False (after
        printing a notice) when no region matches. Non-string values, such as
        a missing country read back from the CSV index, are reported as not
        found instead of raising.
    """
    if regions is None:
        regions = who_regions

    # Exact match first.
    for region in regions:
        if country in regions[region]:
            return region

    if isinstance(country, str) and "..." in country:
        # Plain prefix comparison instead of using the truncated name as a
        # regular expression: names containing characters such as "(" would
        # otherwise be an invalid pattern and raise.
        abrev_country_name = country[:country.index("...")]
        if abrev_country_name:
            for region in regions:
                for c in regions[region]:
                    if c.startswith(abrev_country_name):
                        return region

    print("Country not found among WHO regions: %s" % country)
    return False

# Roll the per-country yearly report counts up to region level:
# region -> {year -> number of selected reports}.
selected_regions_years = {}
for region in who_regions.keys():
    selected_regions_years[region] = {}

for country, reports_by_year in selected_countries_years.items():
    region = get_who_region(country)
    if not region:
        # Country could not be placed in any region; leave it out.
        continue
    region_totals = selected_regions_years[region]
    for year, report_count in reports_by_year.items():
        region_totals[year] = region_totals.get(year, 0) + report_count
In [ ]:
from nltk.tokenize import MWETokenizer

def get_context(index, wordlist, window=25):
    """Return the words surrounding position `index`, excluding the word itself.

    Parameters:
        index: position of the word of interest in `wordlist`.
        wordlist: list of tokens.
        window: maximum number of words taken on each side (default 25).

    Returns:
        A new list with up to `window` words before and up to `window` words
        after `index`; fewer near either end of `wordlist`.
    """
    lowest_index = max(0, index - window)
    highest_index = min(index + 1 + window, len(wordlist))
    return wordlist[lowest_index:index] + wordlist[index+1:highest_index]

# Tokenizer built from the multi-word keyword tuples collected in compound_terms.
tokenizer = MWETokenizer(compound_terms)
regex = re.compile(r'^.{1,3}$') #words with 3 or less chars

# Corpus-wide vocabulary and size counters.
types_count = {}
tokens_count = 0

# Accumulators keyed first by term set ("health", "climate", "intersection").
per_sector, average_per_sector, proportion_per_sector = {}, {}, {}
per_country = {}
per_country_focusyear, average_per_country_focusyear, proportion_per_country_focusyear = {}, {}, {}
per_year, average_per_year, proportion_per_year = {}, {}, {}
per_region = {}
histogram_number_of_mentions = {}

# Accumulators keyed first by region.
per_region_year_intersection = {}
average_per_region_year_intersection = {}
proportion_per_region_year_intersection = {}

histogram_tokens_count = []

global_count_health_keywords, global_count_climate_keywords = {}, {}
global_health_contexts, global_climate_contexts, global_intersection_contexts = [], [], []

cooccurrence_matrix = {}

# Give every term-set accumulator an empty dict per term set.
for termset in ["health", "climate", "intersection"]:
    for accumulator in (per_sector, average_per_sector, proportion_per_sector,
                        per_country,
                        per_country_focusyear, average_per_country_focusyear,
                        proportion_per_country_focusyear,
                        per_year, average_per_year, proportion_per_year,
                        per_region,
                        histogram_number_of_mentions):
        accumulator[termset] = {}

# Give every region accumulator an empty dict per region.
for region in who_regions.keys():
    for accumulator in (per_region_year_intersection,
                        average_per_region_year_intersection,
                        proportion_per_region_year_intersection):
        accumulator[region] = {}

for pdf in selected_pdfs.keys():
    filenametxt = txts_folder + filenameregex.search(pdf)[0] + ".txt"
    print("Loading %s" % (filenametxt))

    try:
        txtFileObj = open(filenametxt, 'r')
    except:
        continue

    wordlist = re.split(r'[\W0-9]+', txtFileObj.read().lower())
    tokens_count += len(wordlist)
    histogram_tokens_count.append(len(wordlist))
    
    for word in wordlist:
        types_count[word] = types_count.get(word, 0) + 1
    
    compounds_wordlist = tokenizer.tokenize(wordlist)
    filtered_compounds_wordlist = [w for w in compounds_wordlist if (len(w) > 3)]

    health_contexts = []
    climate_contexts = []
    health_words = []
    
    for i in range(0,len(filtered_compounds_wordlist)):
        word = filtered_compounds_wordlist[i]
        if word in health_dict:
            context = get_context(i, filtered_compounds_wordlist)
            health_contexts.append(context)
            health_words.append(word)            
            global_count_health_keywords[word] = global_count_health_keywords.get(word, 0) + 1
            global_health_contexts.extend(context)
            
        if word in climate_dict:
            context = get_context(i, filtered_compounds_wordlist)
            climate_contexts.append(context)
            global_count_climate_keywords[word] = global_count_climate_keywords.get(word, 0) + 1
            global_climate_contexts.extend(context)

    total_health_mentions = len(health_contexts)
    
    total_climate_mentions = len(climate_contexts)
        
    total_intersection_mentions = 0
    for i in range(0, len(health_contexts)):
        mention = health_contexts[i]
        hword = health_words[i]
        if hword not in cooccurrence_matrix.keys():
            cooccurrence_matrix[hword] = {}        
        for cword in climate_dict:
            if cword in mention:
                total_intersection_mentions += 1
                global_intersection_contexts.extend(mention)
                cooccurrence_matrix[hword][cword] = cooccurrence_matrix[hword].get(cword, 0) + 1                
        
    language = selected_pdfs[pdf]["language"]
    year = selected_pdfs[pdf]["year"]
    country = selected_pdfs[pdf]["country"]
    sector = selected_pdfs[pdf]["sector"]
    region = get_who_region(country)
    
    histogram_number_of_mentions["health"][total_health_mentions] = histogram_number_of_mentions["health"].get(total_health_mentions, 0) + 1
    histogram_number_of_mentions["climate"][total_climate_mentions] = histogram_number_of_mentions["climate"].get(total_climate_mentions, 0) + 1
    histogram_number_of_mentions["intersection"][total_intersection_mentions] = histogram_number_of_mentions["intersection"].get(total_intersection_mentions, 0) + 1
    
    per_sector["health"][sector] = per_sector["health"].get(sector,0) + total_health_mentions
    per_sector["climate"][sector] = per_sector["climate"].get(sector,0) + total_climate_mentions
    per_sector["intersection"][sector] = per_sector["intersection"].get(sector,0) + total_intersection_mentions
    
    per_country["health"][country] = per_country["health"].get(country,0) + total_health_mentions
    per_country["climate"][country] = per_country["climate"].get(country,0) + total_climate_mentions
    per_country["intersection"][country] = per_country["intersection"].get(country,0) + total_intersection_mentions

    if year == focus_year:
        per_country_focusyear["health"][country] = per_country_focusyear["health"].get(country,0) + total_health_mentions
        per_country_focusyear["climate"][country] = per_country_focusyear["climate"].get(country,0) + total_climate_mentions
        per_country_focusyear["intersection"][country] = per_country_focusyear["intersection"].get(country,0) + total_intersection_mentions

    if region != False:
        per_region["health"][region] = per_region["health"].get(region,0) + total_health_mentions
        per_region["climate"][region] = per_region["climate"].get(region,0) + total_climate_mentions
        per_region["intersection"][region] = per_region["intersection"].get(region,0) + total_intersection_mentions
        per_region_year_intersection[region][year] = per_region_year_intersection[region].get(year,0) + total_intersection_mentions

    per_year["health"][year] = per_year["health"].get(year,0) + total_health_mentions
    per_year["climate"][year] = per_year["climate"].get(year,0) + total_climate_mentions
    per_year["intersection"][year] = per_year["intersection"].get(year,0) + total_intersection_mentions

    if total_health_mentions > 0:
        proportion_per_year["health"][year] = proportion_per_year["health"].get(year,0) + 1
        proportion_per_sector["health"][sector] = proportion_per_sector["health"].get(sector,0) + 1
        if year == focus_year:
            proportion_per_country_focusyear["health"][country] = proportion_per_country_focusyear["health"].get(country,0) + 1

    if total_climate_mentions > 0:
        proportion_per_year["climate"][year] = proportion_per_year["climate"].get(year,0) + 1
        proportion_per_sector["climate"][sector] = proportion_per_sector["climate"].get(sector,0) + 1
        if year == focus_year:
            proportion_per_country_focusyear["climate"][country] = proportion_per_country_focusyear["climate"].get(country,0) + 1
    if total_intersection_mentions > 0:
        proportion_per_year["intersection"][year] = proportion_per_year["intersection"].get(year,0) + 1        
        proportion_per_sector["intersection"][sector] = proportion_per_sector["intersection"].get(sector,0) + 1        
        if year == focus_year:
            proportion_per_country_focusyear["intersection"][country] = proportion_per_country_focusyear["intersection"].get(country,0) + 1
        proportion_per_region_year_intersection[region][year] = proportion_per_region_year_intersection[region].get(year,0) + 1

# Post-processing of the per-report totals accumulated above. For every
# grouping (year, country in the focus year, region/year, sector) this derives
#   * the average number of mentions per report, and
#   * the percentage of reports with at least one mention.
# The proportion_* dictionaries hold raw report counts on entry and are
# overwritten in place with percentages.
# NOTE(review): the selected_* dictionaries are presumably the number of
# selected reports per group - confirm against the cell that fills them.
mention_topics = ("health", "climate", "intersection")

for year in selected_years.keys():
    reports_in_year = selected_years[year]
    for topic in mention_topics:
        average_per_year[topic][year] = per_year[topic][year]/reports_in_year
    for topic in mention_topics:
        proportion_per_year[topic][year] = proportion_per_year[topic].get(year,0)/reports_in_year * 100

for country in selected_countries_years.keys():
    reports_by_year = selected_countries_years[country]
    # Only countries that submitted at least one report in the focus year.
    if focus_year not in reports_by_year.keys():
        continue
    reports_in_focus_year = reports_by_year[focus_year]
    for topic in mention_topics:
        average_per_country_focusyear[topic][country] = per_country_focusyear[topic].get(country, 0)/reports_in_focus_year
    for topic in mention_topics:
        proportion_per_country_focusyear[topic][country] = proportion_per_country_focusyear[topic].get(country,0)/reports_in_focus_year * 100

for region, region_reports in selected_regions_years.items():
    for year, reports_in_region_year in region_reports.items():
        average_per_region_year_intersection[region][year] = per_region_year_intersection[region].get(year,0)/reports_in_region_year
        proportion_per_region_year_intersection[region][year] = proportion_per_region_year_intersection[region].get(year,0)/reports_in_region_year * 100

for sector in selected_sectors.keys():
    reports_in_sector = selected_sectors[sector]
    for topic in mention_topics:
        average_per_sector[topic][sector] = per_sector[topic][sector]/reports_in_sector
    for topic in mention_topics:
        proportion_per_sector[topic][sector] = proportion_per_sector[topic].get(sector,0)/reports_in_sector * 100

Corpus numbers

In [96]:
# Corpus-level statistics collected while the reports were tokenised.
report_count = len(selected_pdfs)
print("Number of tokens: %d" % tokens_count)
print("Number of types: %d" % len(types_count))
print("Average number of tokens per report: %d" % int(tokens_count/report_count))
# Upper-middle element of the sorted values.
# NOTE(review): assumes histogram_tokens_count holds one token count per report - confirm.
print("Median number of tokens per report: %d" % sorted(histogram_tokens_count)[int(report_count/2)])
Number of tokens: 147476375
Number of types: 1401478
Average number of tokens per report: 9692
Median number of tokens per report: 2318

6. Visualising the results of the text analysis

Histogram: Logarithm of number of mentions

Each bar shows the number of reports (on a logarithmic scale) that contain x mentions

In [97]:
df_histogram_number_of_mentions = pd.DataFrame(data=histogram_number_of_mentions)
# Only mention counts 0-99 are drawn; rows are put in ascending order by the filter.
ax = df_histogram_number_of_mentions.filter(items=list(range(0,100)),axis=0).plot.bar(logy=True, figsize=(20,5))
# Label the axes so the figure can be read without the surrounding text.
ax.set_xlabel("Number of mentions")
ax.set_ylabel("Number of reports (log scale)")
ax
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0797ab7ba8>

References per year: total, average, proportion, per WHO region

In [98]:
# Total number of references per year, one column per topic.
df_per_year = pd.DataFrame(per_year)
df_per_year
Out[98]:
climate health intersection
2004 39 54 1
2005 41 69 2
2006 43 95 6
2007 60 64 1
2008 114 94 0
2009 14 8 0
2010 0 0 0
2011 9339 15368 473
2012 11198 20174 476
2013 11890 25459 637
2014 13269 30014 711
2015 13344 28245 734
2016 14931 30193 905
2017 16237 32319 1063
In [99]:
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_per_year_shown = df_per_year.filter(items=years_shown, axis=0)
ax = df_per_year_shown.plot.line(figsize=(15,5))
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_per_year_shown.index))))
ax.set_xticklabels(df_per_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Total number of references")
ax
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072bc3c518>
In [100]:
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_per_year_shown = df_per_year.filter(items=years_shown, axis=0).filter(items=["climate","health"], axis=1)
ax = df_per_year_shown.plot.line(figsize=(15,5))
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_per_year_shown.index))))
ax.set_xticklabels(df_per_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Total number of references")
ax
Out[100]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0725ba1240>
In [101]:
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_per_year_shown = df_per_year.filter(items=years_shown, axis=0).filter(items=["intersection"], axis=1)
ax = df_per_year_shown.plot.line(figsize=(15,5), color="green", legend=False)
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_per_year_shown.index))))
ax.set_xticklabels(df_per_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Total number of references")
ax
Out[101]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072b096748>
In [102]:
df_per_region_year_intersection = pd.DataFrame(data=per_region_year_intersection)
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_per_region_year_shown = df_per_region_year_intersection.filter(items=years_shown, axis=0)
ax = df_per_region_year_shown.plot.line(figsize=(15,5))
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_per_region_year_shown.index))))
ax.set_xticklabels(df_per_region_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Total number of references")
ax
Out[102]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072b50de10>
In [103]:
# Average number of references per report and year, one column per topic.
df_average_per_year = pd.DataFrame(average_per_year)
df_average_per_year
Out[103]:
climate health intersection
2004 39.000000 54.000000 1.000000
2005 41.000000 69.000000 2.000000
2006 43.000000 95.000000 6.000000
2007 60.000000 64.000000 1.000000
2008 114.000000 94.000000 0.000000
2009 14.000000 8.000000 0.000000
2010 0.000000 0.000000 0.000000
2011 7.318966 12.043887 0.370690
2012 6.312289 11.372041 0.268320
2013 5.556075 11.896729 0.297664
2014 5.873838 13.286410 0.314741
2015 5.444308 11.523868 0.299470
2016 5.630090 11.384992 0.341252
2017 6.111027 12.163718 0.400075
In [104]:
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_average_per_year_shown = df_average_per_year.filter(items=years_shown, axis=0)
ax = df_average_per_year_shown.plot.line(figsize=(15,5))
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_average_per_year_shown.index))))
ax.set_xticklabels(df_average_per_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Average number of references")
ax
Out[104]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072b49f828>
In [105]:
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_average_per_year_shown = df_average_per_year.filter(items=years_shown, axis=0).filter(items=["climate","health"], axis=1)
ax = df_average_per_year_shown.plot.line(figsize=(15,5))
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_average_per_year_shown.index))))
ax.set_xticklabels(df_average_per_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Average number of references")
ax
Out[105]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072b4cb898>
In [106]:
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_average_per_year_shown = df_average_per_year.filter(items=years_shown, axis=0).filter(items=["intersection"], axis=1)
ax = df_average_per_year_shown.plot.line(figsize=(15,5), color="green", legend=False)
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_average_per_year_shown.index))))
ax.set_xticklabels(df_average_per_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Average number of references")
ax
Out[106]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072b4cb7b8>
In [107]:
df_average_per_region_year_intersection = pd.DataFrame(data=average_per_region_year_intersection)
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_average_per_region_year_shown = df_average_per_region_year_intersection.filter(items=years_shown, axis=0)
ax = df_average_per_region_year_shown.plot.line(figsize=(15,5))
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_average_per_region_year_shown.index))))
ax.set_xticklabels(df_average_per_region_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Average number of references")
ax
Out[107]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072afc45f8>
In [108]:
# Percentage of reports per year with at least one mention, one column per topic.
df_proportion_per_year = pd.DataFrame(proportion_per_year)
df_proportion_per_year
Out[108]:
climate health intersection
2004 100.000000 100.000000 100.000000
2005 100.000000 100.000000 100.000000
2006 100.000000 100.000000 100.000000
2007 100.000000 100.000000 100.000000
2008 100.000000 100.000000 0.000000
2009 100.000000 100.000000 0.000000
2010 0.000000 0.000000 0.000000
2011 43.025078 53.291536 12.225705
2012 44.081172 55.749718 12.175874
2013 42.056075 55.934579 11.495327
2014 43.249225 57.724657 11.553785
2015 42.390861 57.935537 12.525500
2016 42.345400 56.711916 13.499246
2017 44.222808 57.508468 14.076026
In [109]:
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_proportion_per_year_shown = df_proportion_per_year.filter(items=years_shown, axis=0)
ax = df_proportion_per_year_shown.plot.line(figsize=(15,5))
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_proportion_per_year_shown.index))))
ax.set_xticklabels(df_proportion_per_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Proportion of reports (%)")
ax.set_ylim(ymin=0)
ax
Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072b289fd0>
In [110]:
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_proportion_per_year_shown = df_proportion_per_year.filter(items=years_shown, axis=0).filter(items=["climate","health"], axis=1)
ax = df_proportion_per_year_shown.plot.line(figsize=(15,5))
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_proportion_per_year_shown.index))))
ax.set_xticklabels(df_proportion_per_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Proportion of reports (%)")
ax.set_ylim(ymin=0)
ax
Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072afd3160>
In [111]:
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_proportion_per_year_shown = df_proportion_per_year.filter(items=years_shown, axis=0).filter(items=["intersection"], axis=1)
ax = df_proportion_per_year_shown.plot.line(figsize=(15,5), color="green", legend=False)
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_proportion_per_year_shown.index))))
ax.set_xticklabels(df_proportion_per_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Proportion of reports (%)")
ax.set_ylim(ymin=0)
ax
Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072af47cf8>
In [112]:
df_proportion_per_region_year_intersection = pd.DataFrame(data=proportion_per_region_year_intersection)
years_shown = ['2011','2012','2013','2014','2015','2016','2017']
df_proportion_per_region_year_shown = df_proportion_per_region_year_intersection.filter(items=years_shown, axis=0)
ax = df_proportion_per_region_year_shown.plot.line(figsize=(15,5))
# One tick per plotted row, labelled from the frame's own index, instead of
# padding the label list with a dummy 0 and relying on the automatic locator.
ax.set_xticks(list(range(len(df_proportion_per_region_year_shown.index))))
ax.set_xticklabels(df_proportion_per_region_year_shown.index)
ax.set_xlabel("Year")
ax.set_ylabel("Proportion of reports (%)")
ax.set_ylim(ymin=0)
ax
Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072b93edd8>

References per sector: total, average, proportion

In [113]:
# Total number of references per sector, one column per topic.
df_per_sector = pd.DataFrame(per_sector)
df_per_sector
Out[113]:
climate health intersection
Aerospace & Defense 851 1833 58
Alternative Energy 1382 1067 69
Automobiles & Parts 2466 5169 134
Banks 2167 2836 58
Beverages 2242 4419 175
Chemicals 4878 12297 512
Construction & Materials 5595 10758 312
Diversified 494 1141 16
Electricity 3691 3517 153
Electronic & Electrical Equ... 2126 3997 77
Equity Investment Instruments 105 788 7
Financial Services 8754 9220 283
Fixed Line Telecommunications 1191 2774 63
Food & Drug Retailers 265 561 9
Food Producers 3707 10724 281
Forestry & Paper 1650 2108 53
Gas, Water & Multiutilities 2087 1977 66
General Industrials 5927 12366 376
General Retailers 2475 4323 95
Health Care Equipment & Ser... 545 5358 79
Household Goods & Home Cons... 1085 2021 38
Industrial Engineering 1713 3962 139
Industrial Goods & Services 0 0 0
Industrial Metals & Mining 2193 6553 136
Industrial Transportation 1903 3701 72
Leisure Goods 266 388 7
Life Insurance 209 939 4
Media 1243 2654 28
Mining 1120 3913 70
Mobile Telecommunications 2369 4570 156
Nonequity Investment Instru... 30 105 0
Nonlife Insurance 209 1076 4
Not Applicable 695 1345 60
Oil & Gas Producers 5877 8818 339
Oil Equipment, Services & D... 632 1596 44
Personal Goods 1084 2875 72
Pharmaceuticals & Biotechno... 1186 11614 222
Real Estate Investment & Se... 1425 2771 49
Real Estate Investment Trusts 680 1431 85
Software & Computer Services 2098 3632 94
Support Services 4018 8184 184
Technology Hardware & Equip... 5459 8935 260
Telecommunications 2 5 1
Tobacco 19 41 0
Travel & Leisure 2406 3794 69
In [114]:
# Leave out sectors that have neither climate nor health references.
sectors_with_mentions = df_per_sector['climate'].gt(0) | df_per_sector['health'].gt(0)
ax = df_per_sector.loc[sectors_with_mentions].plot.bar(stacked=True, figsize=(15,5))
ax.set(xlabel="Sector", ylabel="Total number of references")
ax
Out[114]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f072b67bb00>
In [115]:
# Average number of references per report and sector, one column per topic.
df_average_per_sector = pd.DataFrame(average_per_sector)
df_average_per_sector
Out[115]:
climate health intersection
Aerospace & Defense 5.598684 12.059211 0.381579
Alternative Energy 14.247423 11.000000 0.711340
Automobiles & Parts 6.455497 13.531414 0.350785
Banks 6.771875 8.862500 0.181250
Beverages 7.006250 13.809375 0.546875
Chemicals 10.312896 25.997886 1.082452
Construction & Materials 6.042117 11.617711 0.336933
Diversified 11.488372 26.534884 0.372093
Electricity 15.841202 15.094421 0.656652
Electronic & Electrical Equ... 3.671848 6.903282 0.132988
Equity Investment Instruments 1.346154 10.102564 0.089744
Financial Services 7.922172 8.343891 0.256109
Fixed Line Telecommunications 7.490566 17.446541 0.396226
Food & Drug Retailers 9.137931 19.344828 0.310345
Food Producers 6.347603 18.363014 0.481164
Forestry & Paper 7.894737 10.086124 0.253589
Gas, Water & Multiutilities 12.962733 12.279503 0.409938
General Industrials 5.005912 10.444257 0.317568
General Retailers 5.113636 8.931818 0.196281
Health Care Equipment & Ser... 2.289916 22.512605 0.331933
Household Goods & Home Cons... 5.452261 10.155779 0.190955
Industrial Engineering 3.965278 9.171296 0.321759
Industrial Goods & Services 0.000000 0.000000 0.000000
Industrial Metals & Mining 5.401478 16.140394 0.334975
Industrial Transportation 4.092473 7.959140 0.154839
Leisure Goods 4.750000 6.928571 0.125000
Life Insurance 4.019231 18.057692 0.076923
Media 2.762222 5.897778 0.062222
Mining 8.750000 30.570312 0.546875
Mobile Telecommunications 9.217899 17.782101 0.607004
Nonequity Investment Instru... 3.000000 10.500000 0.000000
Nonlife Insurance 2.679487 13.794872 0.051282
Not Applicable 4.664430 9.026846 0.402685
Oil & Gas Producers 15.384817 23.083770 0.887435
Oil Equipment, Services & D... 4.328767 10.931507 0.301370
Personal Goods 2.683168 7.116337 0.178218
Pharmaceuticals & Biotechno... 3.888525 38.078689 0.727869
Real Estate Investment & Se... 5.357143 10.417293 0.184211
Real Estate Investment Trusts 17.894737 37.657895 2.236842
Software & Computer Services 3.574106 6.187394 0.160136
Support Services 2.394517 4.877235 0.109654
Technology Hardware & Equip... 10.961847 17.941767 0.522088
Telecommunications 1.000000 2.500000 0.500000
Tobacco 4.750000 10.250000 0.000000
Travel & Leisure 5.152034 8.124197 0.147752
In [116]:
# Leave out sectors that have neither climate nor health references.
sectors_with_mentions = df_average_per_sector['climate'].gt(0) | df_average_per_sector['health'].gt(0)
ax = df_average_per_sector.loc[sectors_with_mentions].plot.bar(stacked=True, figsize=(20,5))
ax.set(xlabel="Sector", ylabel="Average number of references")
ax
Out[116]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0742990048>
In [117]:
# Percentage of reports per sector with at least one mention, one column per topic.
df_proportion_per_sector = pd.DataFrame(proportion_per_sector)
df_proportion_per_sector
Out[117]:
climate health intersection
Aerospace & Defense 46.052632 60.526316 9.210526
Alternative Energy 62.886598 60.824742 23.711340
Automobiles & Parts 34.293194 58.900524 14.659686
Banks 50.312500 60.000000 10.937500
Beverages 51.875000 60.937500 17.187500
Chemicals 56.448203 66.807611 29.386892
Construction & Materials 43.628510 54.859611 14.038877
Diversified 60.465116 72.093023 16.279070
Electricity 65.665236 69.098712 26.180258
Electronic & Electrical Equ... 28.842832 42.314335 6.908463
Equity Investment Instruments 23.076923 42.307692 5.128205
Financial Services 51.945701 53.936652 10.769231
Fixed Line Telecommunications 52.830189 66.037736 15.723270
Food & Drug Retailers 62.068966 58.620690 17.241379
Food Producers 47.089041 63.698630 15.924658
Forestry & Paper 52.153110 58.851675 14.354067
Gas, Water & Multiutilities 48.447205 50.931677 11.801242
General Industrials 39.104730 50.168919 10.726351
General Retailers 36.157025 49.586777 10.743802
Health Care Equipment & Ser... 26.470588 63.025210 10.504202
Household Goods & Home Cons... 36.180905 56.281407 9.045226
Industrial Engineering 39.814815 53.472222 12.731481
Industrial Goods & Services 0.000000 0.000000 0.000000
Industrial Metals & Mining 46.798030 63.300493 15.763547
Industrial Transportation 40.430108 54.193548 7.311828
Leisure Goods 35.714286 51.785714 8.928571
Life Insurance 46.153846 48.076923 5.769231
Media 36.888889 53.333333 4.000000
Mining 44.531250 60.156250 22.656250
Mobile Telecommunications 60.311284 73.540856 26.459144
Nonequity Investment Instru... 90.000000 100.000000 0.000000
Nonlife Insurance 41.025641 52.564103 2.564103
Not Applicable 31.543624 55.033557 11.409396
Oil & Gas Producers 62.041885 73.036649 31.937173
Oil Equipment, Services & D... 41.780822 61.643836 17.123288
Personal Goods 34.900990 57.178218 6.435644
Pharmaceuticals & Biotechno... 40.983607 67.868852 17.704918
Real Estate Investment & Se... 46.992481 51.503759 11.654135
Real Estate Investment Trusts 60.526316 65.789474 44.736842
Software & Computer Services 34.582624 56.047700 6.814310
Support Services 34.922527 51.728248 6.436234
Technology Hardware & Equip... 47.791165 59.638554 16.867470
Telecommunications 50.000000 50.000000 50.000000
Tobacco 25.000000 50.000000 0.000000
Travel & Leisure 46.038544 60.171306 8.993576
In [118]:
# Leave out sectors that have neither climate nor health references.
sectors_with_mentions = df_proportion_per_sector['climate'].gt(0) | df_proportion_per_sector['health'].gt(0)
ax = df_proportion_per_sector.loc[sectors_with_mentions].plot.bar(
    stacked=False,
    figsize=(20,5),
    title="Proportion of reports with mentions per sector",
)
ax.set(xlabel="Sector", ylabel="Proportion of reports (%)")
ax
Out[118]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f07427872b0>

References per country: total

In [119]:
# Total number of references per country; countries with neither climate
# nor health references are hidden from the displayed table.
df_per_country = pd.DataFrame(per_country)
countries_with_mentions = df_per_country['climate'].gt(0) | df_per_country['health'].gt(0)
df_per_country.loc[countries_with_mentions]
Out[119]:
climate health intersection
Andorra 5 52 0
Angola 13 166 1
Argentina 209 432 8
Armenia 18 6 0
Australia 2566 3488 117
Austria 903 1453 19
Azerbaijan 3 15 0
Bahrain 38 289 3
Bangladesh 122 570 10
Belarus 64 847 1
Belgium 691 843 22
Bermuda 238 779 75
Bolivia 2 3 2
Bosnia-Herze... 0 36 0
Brazil 2831 5315 104
Bulgaria 17 240 2
Cambodia 0 1 0
Canada 1579 2662 76
Chile 182 314 2
China 2267 3545 145
Colombia 280 759 17
Costa Rica 38 144 2
Croatia 962 1345 47
Cyprus 24 106 3
Czechia 0 9 0
Denmark 2818 7814 139
Dominican Re... 63 70 6
Ecuador 0 6 0
Egypt 237 1057 13
El Salvador 31 19 0
... ... ... ...
Romania 75 299 16
Russian Fede... 253 1649 20
Saudi Arabia 84 343 15
Serbia 123 251 4
Sierra Leone 0 21 0
Singapore 1041 2141 67
Slovakia 97 236 2
Slovenia 17 88 1
Somalia 0 7 0
South Africa 2079 5072 117
South Sudan 0 1 0
Spain 2661 4407 133
Sri Lanka 792 2571 36
Sudan 6 72 0
Sweden 3977 6628 151
Switzerland 1481 3393 120
Syrian Arab ... 31 111 0
Tanzania, Un... 0 2 0
Thailand 1062 1458 54
Trinidad And... 5 57 0
Tunisia 7 13 0
Turkey 2993 5163 158
Uganda 7 188 1
Ukraine 106 260 6
United Arab ... 144 1008 8
United Kingdom 4838 9508 333
United State... 7934 19110 682
Uruguay 0 1 0
Viet Nam 9 71 1
Zimbabwe 1 8 0

116 rows × 3 columns

In [120]:
# Leave out countries that have neither climate nor health references.
countries_with_mentions = df_per_country['climate'].gt(0) | df_per_country['health'].gt(0)
ax = df_per_country.loc[countries_with_mentions].plot.bar(stacked=True, figsize=(20,10))
ax.set(xlabel="Country", ylabel="Total number of references")
ax
Out[120]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0742645160>

References per WHO region: total

In [121]:
# Total number of references per WHO region, one column per topic.
df_per_region = pd.DataFrame(per_region)
df_per_region
Out[121]:
climate health intersection
Africa 2482 8453 180
Eastern Mediterranean 1033 4804 78
Europe 44016 90764 2271
Latin America and the Caribbean 5344 10745 202
North America 9751 22551 833
South-East Asia 5078 12463 317
Western Pacific 22815 32376 1128
In [122]:
# Stacked totals per WHO region.
ax = df_per_region.plot.bar(figsize=(20,10), stacked=True)
ax.set(xlabel="WHO Region", ylabel="Total number of references")
ax
Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f074253fbe0>

References per country in 2017: total, average, proportion

Download shapefiles ne_110m_admin_0_countries.* from here

In [123]:
import matplotlib.pyplot as plt
import matplotlib.cm
import numpy as np

from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.colors import Normalize

def create_map(per_country_counts, title, resolution='c'):
    """Draw a world map whose countries are coloured by a per-country value.

    Parameters
    ----------
    per_country_counts : dict
        Maps a country name, spelled as in the per-country tables of this
        notebook (long names truncated with "..."), to the value to plot.
    title : str
        Title shown above the map.
    resolution : str or None, optional
        Passed to ``Basemap`` as ``resolution`` (c, l, i, h, f or None).

    Notes
    -----
    Reads the ``ne_110m_admin_0_countries`` shapefile from the working
    directory. Shapes with no matching entry in ``per_country_counts`` are
    coloured as if their value were 0.
    """

    # Notebook spelling -> spelling used in the shapefile's NAME field.
    mapped_country_names = {'Bosnia-Herze...' : 'Bosnia and Herz.', "Cote d'Ivoire" : "Côte d'Ivoire", 
                            'Congo, Democ...' : 'Dem. Rep. Congo', 'Dominican Re...' : 'Dominican Rep.', 
                            'Iran, Islami...' : 'Iran', 'Macedonia, T...' : 'Macedonia', 
                            'Moldova, Rep...' : 'Moldova', 'Palestine, S...' : 'Palestine', 
                            'Russian Fede...' : 'Russia', 'South Sudan' : 'S. Sudan', 
                            'Korea, Repub...' : 'South Korea' , 'Syrian Arab ...' : 'Syria', 
                            'Tanzania, Un...' : 'Tanzania', 'Trinidad And...' : 'Trinidad and Tobago', 
                            'United Arab ...' : 'United Arab Emirates', 'United State...' : 'United States of America',
                            'Viet Nam' : 'Vietnam'}

    per_normalised_country_counts = [
        (mapped_country_names.get(country, country), count)
        for country, count in per_country_counts.items()
    ]
    df_per_normalised_country_counts = pd.DataFrame(per_normalised_country_counts, columns=["country", "count"])

    fig, ax = plt.subplots(figsize=(30,20))
    plt.title(title)

    m = Basemap(resolution=resolution, # c, l, i, h, f or None
                projection='cyl',
                lat_0=0, lon_0=0,
                llcrnrlon=-170, llcrnrlat=-60, urcrnrlon=190, urcrnrlat=84)

    m.drawmapboundary(fill_color='#aaccec')
    m.fillcontinents(color='#f2f2f2',lake_color='#aaccec')
    m.drawcoastlines()
    m.readshapefile("ne_110m_admin_0_countries", "countries")
    shapename_regex = re.compile(r'(?<=^)[\w \.\']+')

    # Keep the leading run of name characters; if a name does not start with
    # one, fall back to the raw value instead of failing on a None match.
    shape_country_names = []
    for area in m.countries_info:
        name_match = shapename_regex.search(area['NAME'])
        shape_country_names.append(name_match[0] if name_match else area['NAME'])

    df_poly = pd.DataFrame({
        # `closed` is given by keyword rather than by position.
        'shapes': [Polygon(np.array(shape), closed=True) for shape in m.countries],
        'country': shape_country_names
    })
    # A country can consist of several shapes, hence many shapes to one count.
    df_poly = df_poly.merge(df_per_normalised_country_counts, on='country', how='left', validate="many_to_one")

    cmap = plt.get_cmap('YlOrRd')   
    pc = PatchCollection(df_poly.shapes, zorder=2)
    norm = Normalize()
 
    # The first call to `norm` also fixes the colour scale limits.
    pc.set_facecolor(cmap(norm(df_poly['count'].fillna(0).values)))
    ax.add_collection(pc)

    mapper = matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap)
    mapper.set_array(df_poly['count'])

    # The mappable is not attached to any axes, so name the axes explicitly.
    plt.colorbar(mapper, ax=ax, shrink=0.5)


# Title follows the configured focus year instead of a hard-coded "2017".
create_map(per_country_focusyear["health"], "%s - Global Compact reports - Health - Total number of references" % focus_year)
In [124]:
# Title follows the configured focus year instead of a hard-coded "2017".
create_map(per_country_focusyear["climate"], "%s - Global Compact reports - Climate - Total number of references" % focus_year)
In [125]:
# Title follows the configured focus year instead of a hard-coded "2017".
create_map(per_country_focusyear["intersection"], "%s - Global Compact reports - Health&Climate intersection - Total number of references" % focus_year)
In [126]:
# Title follows the configured focus year instead of a hard-coded "2017".
create_map(average_per_country_focusyear["health"], "%s - Global Compact reports - Health - Average number of references" % focus_year)
In [127]:
# Title follows the configured focus year instead of a hard-coded "2017".
create_map(average_per_country_focusyear["climate"], "%s - Global Compact reports - Climate - Average number of references" % focus_year)
In [128]:
# Title follows the configured focus year instead of a hard-coded "2017".
create_map(average_per_country_focusyear["intersection"], "%s - Global Compact reports - Health&Climate intersection - Average number of references" % focus_year)
In [129]:
# Title follows the configured focus year instead of a hard-coded "2017".
create_map(proportion_per_country_focusyear["health"], "%s - Global Compact reports - Health - Proportion of reports" % focus_year)
In [130]:
# Title follows the configured focus year instead of a hard-coded "2017".
create_map(proportion_per_country_focusyear["climate"], "%s - Global Compact reports - Climate - Proportion of reports" % focus_year)
In [131]:
# Title follows the configured focus year instead of a hard-coded "2017".
create_map(proportion_per_country_focusyear["intersection"], "%s - Global Compact reports - Health&Climate intersection - Proportion of reports" % focus_year)

Frequency of keywords: health and climate

In [132]:
# Health keywords ranked from most to least frequently matched.
health_keyword_ranking = sorted(
    global_count_health_keywords.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
df_health_keywords = pd.DataFrame(health_keyword_ranking, columns=["Keyword", "Number of mentions"])
df_health_keywords
Out[132]:
Keyword Number of mentions
0 health 139464
1 healthcare 12645
2 nutrition 6421
3 disease 5201
4 health_care 4937
5 illness 4173
6 mortality 2117
7 air_pollution 1701
8 public_health 1580
9 malaria 1110
10 infectious 642
11 infection 541
12 malnutrition 507
13 sars 348
14 pandemic 234
15 epidemic 206
16 morbidity 137
17 pneumonia 51
18 measles 41
19 diarrhoea 34
20 epidemiology 27
21 stunting 18
22 communicable_disease 14
23 mental_disorder 7
In [133]:
# Climate keywords ranked from most to least frequently matched.
climate_keyword_ranking = sorted(
    global_count_climate_keywords.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
df_climate_keywords = pd.DataFrame(climate_keyword_ranking, columns=["Keyword", "Number of mentions"])
df_climate_keywords
Out[133]:
Keyword Number of mentions
0 greenhouse 30835
1 climate_change 27416
2 renewable_energy 15693
3 low_carbon 5331
4 temperature 4550
5 global_warming 3975
6 green_house 1061
7 carbon_emission 824
8 extreme_weather 728
9 ghge 86
10 climate_variability 16
11 climate_pollutant 2
12 global_environmental_change 2

Word clouds: health, climate, intersection

Up to 200 most frequent words that appear in the context of our health or climate keywords or both

In [134]:
import collections
from wordcloud import WordCloud, STOPWORDS
from stop_words import get_stop_words

# Number of top-ranked context words considered for each word cloud.
threshold = 200

# Stop-word list for the language selected in `focus_language`.
language_specific_stopwords = get_stop_words(focus_language)

def create_wordcloud(contexts, stopwords=None):
    """Show a word cloud of the most frequent words in ``contexts``.

    Parameters
    ----------
    contexts : iterable of str
        Context words to count.
    stopwords : iterable of str, optional
        Extra words to leave out, in addition to ``STOPWORDS`` and
        ``language_specific_stopwords``. The argument is never modified.

    Notes
    -----
    Only the ``threshold`` most frequent words are considered and stop words
    are removed from that selection afterwards, so fewer than ``threshold``
    words may be drawn. Nothing is drawn when no word is left.
    """
    # Build a private set: extending the argument in place would add every
    # stop word to the caller's keyword list (and to a shared default list).
    excluded_words = set(STOPWORDS)
    excluded_words.update(language_specific_stopwords)
    if stopwords is not None:
        excluded_words.update(stopwords)

    context_unigrams = collections.Counter(contexts)
    top_words = sorted(context_unigrams.items(), key=lambda k: k[1], reverse=True)[0:threshold]
    most_frequent_words = {word: freq for word, freq in top_words if word not in excluded_words}

    # WordCloud cannot be generated from an empty frequency table.
    if not most_frequent_words:
        print("No words left to display after removing stop words.")
        return

    wordcloud = WordCloud(background_color="white", scale=10).generate_from_frequencies(most_frequent_words)

    plt.figure(1, figsize=(20, 12))
    plt.axis('off')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()
    

# Pass a copy so the health keyword list itself can never be modified by the callee.
create_wordcloud(global_health_contexts, list(health_dict))
In [135]:
# Pass a copy so the climate keyword list itself can never be modified by the callee.
create_wordcloud(global_climate_contexts, list(climate_dict))
In [136]:
# Both keyword lists are excluded from the intersection cloud.
create_wordcloud(global_intersection_contexts, stopwords=climate_dict + health_dict)

Network graphs: intersection

Blue nodes are health keywords and green nodes are climate keywords. The closer the nodes are to each other, the more often the words co-occur. The graph only shows links between words that co-occurred at least `min_coocurrence` times in the whole corpus (10 for English, 2 for the other languages; see `language_ref`).

In [137]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np


# Keyword co-occurrence graph: one edge per (health keyword, climate keyword)
# pair whose count reaches the minimum configured for the focus language.
G = nx.Graph()

health_nodes = []
climate_nodes = []

for health_word, climate_counts in cooccurrence_matrix.items():
    for climate_word, pair_count in climate_counts.items():
        if pair_count < language_ref[focus_language]['min_coocurrence']:
            continue
        G.add_edge(health_word, climate_word, weight=pair_count)
        health_nodes.append(health_word)
        climate_nodes.append(climate_word)

plt.figure(figsize=(15,15))

pos = nx.spring_layout(G)
# Health keywords in blue, climate keywords in green.
for node_group, node_colour in ((health_nodes, 'b'), (climate_nodes, 'g')):
    nx.draw_networkx_nodes(G, pos, nodelist=node_group, node_size=1000, node_color=node_colour)
nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=True), width=2)
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')
# Edge weights are not drawn; nx.draw_networkx_edge_labels can add them if needed.

plt.axis('off')
plt.show()